{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d5f14171",
   "metadata": {},
   "source": [
    "### P053 文本数据 - CountVectorizer向量化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2d11d14d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "72a59c86",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = [\n",
    "    'python is a programming language',\n",
    "    'python is popular',\n",
    "    'programming in python',\n",
    "    'object-oriented programming in python'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "172b1213",
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "88fe5266",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<4x8 sparse matrix of type '<class 'numpy.int64'>'\n",
       "\twith 15 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer.fit_transform(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e6378bc5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>in</th>\n",
       "      <th>is</th>\n",
       "      <th>language</th>\n",
       "      <th>object</th>\n",
       "      <th>oriented</th>\n",
       "      <th>popular</th>\n",
       "      <th>programming</th>\n",
       "      <th>python</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   in  is  language  object  oriented  popular  programming  python\n",
       "0   0   1         1       0         0        0            1       1\n",
       "1   0   1         0       0         0        1            0       1\n",
       "2   1   0         0       0         0        0            1       1\n",
       "3   1   0         0       1         1        0            1       1"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(\n",
    "    vectorizer.fit_transform(documents).toarray(),\n",
    "    columns = vectorizer.get_feature_names()\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "124e1b44",
   "metadata": {},
   "source": [
    "### P054 文本数据 - 计数向量化并配置停用词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "987e20b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(stop_words='english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "14b2d070",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>language</th>\n",
       "      <th>object</th>\n",
       "      <th>oriented</th>\n",
       "      <th>popular</th>\n",
       "      <th>programming</th>\n",
       "      <th>python</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   language  object  oriented  popular  programming  python\n",
       "0         1       0         0        0            1       1\n",
       "1         0       0         0        1            0       1\n",
       "2         0       0         0        0            1       1\n",
       "3         0       1         1        0            1       1"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(\n",
    "    vectorizer.fit_transform(documents).toarray(),\n",
    "    columns = vectorizer.get_feature_names()\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d89a152a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f9e1515",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77b1ca73",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "b83b807b",
   "metadata": {},
   "source": [
    "### P055 文本数据-计数向量化并配置n-gram"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "dcf5967b",
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "283f2a08",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>language</th>\n",
       "      <th>object</th>\n",
       "      <th>object oriented</th>\n",
       "      <th>oriented</th>\n",
       "      <th>oriented programming</th>\n",
       "      <th>popular</th>\n",
       "      <th>programming</th>\n",
       "      <th>programming language</th>\n",
       "      <th>programming python</th>\n",
       "      <th>python</th>\n",
       "      <th>python popular</th>\n",
       "      <th>python programming</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   language  object  object oriented  oriented  oriented programming  popular  \\\n",
       "0         1       0                0         0                     0        0   \n",
       "1         0       0                0         0                     0        1   \n",
       "2         0       0                0         0                     0        0   \n",
       "3         0       1                1         1                     1        0   \n",
       "\n",
       "   programming  programming language  programming python  python  \\\n",
       "0            1                     1                   0       1   \n",
       "1            0                     0                   0       1   \n",
       "2            1                     0                   1       1   \n",
       "3            1                     0                   1       1   \n",
       "\n",
       "   python popular  python programming  \n",
       "0               0                   1  \n",
       "1               1                   0  \n",
       "2               0                   0  \n",
       "3               0                   0  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(\n",
    "    vectorizer.fit_transform(documents).toarray(),\n",
    "    columns = vectorizer.get_feature_names()\n",
    ")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2472baf8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81c04da8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "319a5fd8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "681e1819",
   "metadata": {},
   "source": [
    "### P056 文本数据 - TFIDF实现文本向量化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "262796e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6addb438",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = [\n",
    "    'python is a programming language',\n",
    "    'python is popular',\n",
    "    'programming in python',\n",
    "    'object-oriented programming in python'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "972fbcd8",
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf_vectorizer = TfidfVectorizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c8400175",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(\n",
    "    data = tfidf_vectorizer.fit_transform(documents).toarray(),\n",
    "    columns = tfidf_vectorizer.get_feature_names()\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "11957cc3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>in</th>\n",
       "      <th>is</th>\n",
       "      <th>language</th>\n",
       "      <th>object</th>\n",
       "      <th>oriented</th>\n",
       "      <th>popular</th>\n",
       "      <th>programming</th>\n",
       "      <th>python</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.519714</td>\n",
       "      <td>0.659191</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.420753</td>\n",
       "      <td>0.343993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.572892</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.726641</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.379192</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.691131</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.559530</td>\n",
       "      <td>0.457453</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.433919</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.550372</td>\n",
       "      <td>0.550372</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.351295</td>\n",
       "      <td>0.287207</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         in        is  language    object  oriented   popular  programming  \\\n",
       "0  0.000000  0.519714  0.659191  0.000000  0.000000  0.000000     0.420753   \n",
       "1  0.000000  0.572892  0.000000  0.000000  0.000000  0.726641     0.000000   \n",
       "2  0.691131  0.000000  0.000000  0.000000  0.000000  0.000000     0.559530   \n",
       "3  0.433919  0.000000  0.000000  0.550372  0.550372  0.000000     0.351295   \n",
       "\n",
       "     python  \n",
       "0  0.343993  \n",
       "1  0.379192  \n",
       "2  0.457453  \n",
       "3  0.287207  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06c17c01",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "4bce5db7",
   "metadata": {},
   "source": [
    "### P057 文本数据-TFIDF向量化增加停用词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a0865262",
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf_vectorizer = TfidfVectorizer(\n",
    "    stop_words=['is', 'in']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "de75d002",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(\n",
    "    data = tfidf_vectorizer.fit_transform(documents).toarray(),\n",
    "    columns = tfidf_vectorizer.get_feature_names()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7dfd8b97",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>language</th>\n",
       "      <th>object</th>\n",
       "      <th>oriented</th>\n",
       "      <th>popular</th>\n",
       "      <th>programming</th>\n",
       "      <th>python</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.771579</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.492489</td>\n",
       "      <td>0.402642</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.886548</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.462637</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.774191</td>\n",
       "      <td>0.632952</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.610878</td>\n",
       "      <td>0.610878</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.389916</td>\n",
       "      <td>0.318782</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   language    object  oriented   popular  programming    python\n",
       "0  0.771579  0.000000  0.000000  0.000000     0.492489  0.402642\n",
       "1  0.000000  0.000000  0.000000  0.886548     0.000000  0.462637\n",
       "2  0.000000  0.000000  0.000000  0.000000     0.774191  0.632952\n",
       "3  0.000000  0.610878  0.610878  0.000000     0.389916  0.318782"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d93c016e",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "310c7ecb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
